/*============================================================================
  File:     Uneven distribution (AdventureWorksDW2008_ModifiedSalesKey).sql

  Summary:  Skewed distribution across multiple columns - what happens?

  SQL Server Version: 2008+
------------------------------------------------------------------------------
  Written by Kimberly L. Tripp & Paul S. Randal, SQLskills.com

  For more scripts and sample code, check out http://www.SQLskills.com

  This script is intended as a supplement to the SQL Server 2008 Jumpstart or
  Metro training.
  
  THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF 
  ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED 
  TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  PARTICULAR PURPOSE.
============================================================================*/

-- Here's the dropbox link to the backup of this database
-- https://www.dropbox.com/sh/wbvcjsdnbj7hcw6/AAB6LRvEyghxn9qZv0zDI0gPa?dl=1

-- All statements in this script run against the modified sample database.
USE [AdventureWorksDW2008_ModifiedSalesKey];
GO

-- Optional clean up from an earlier run. Everything in the next four batches
-- is commented out on purpose; uncomment and run only what you need.

-- 1) See which indexes and statistics currently exist:
--sp_SQLskills_helpindex 'FactInternetSales'
--exec sp_helpstats 'FactInternetSales', 'all'
GO

-- 2) Drop the two demo indexes this script creates further down, plus
--    statistics on the table that may be left over:
--DROP INDEX [FactInternetSales].ShipDateOrderDateInd
--DROP INDEX [FactInternetSales].ShipDateOrderDateInd_SeekableForMin
--DROP STATISTICS factinternetsales.[SalesByCustomer_R11000-12000]
--DROP STATISTICS factinternetsales.[SalesByCustomer_11142-11185]
--DROP STATISTICS factinternetsales.[_WA_Sys_00000008_1273C1CD]
--DROP STATISTICS factinternetsales.[_WA_Sys_0000000A_1273C1CD]
GO

-- 3) Drop the column statistics on customerkey through the helper procedure:
--EXEC [sp_SQLskills_DropAllColumnStats]	
--		'dbo', 'factinternetsales', 'customerkey', 'TRUE';
GO

-- 4) Re-check the indexes and statistics after cleaning up:
--sp_SQLskills_helpindex 'FactInternetSales'
--exec sp_helpstats 'FactInternetSales', 'all'
GO

-- List the indexes on the fact table.
-- sp_SQLskills_helpindex is my version of sp_helpindex; download it from
-- https://www.sqlskills.com/blogs/kimberly/category/sp_helpindex-rewrites/
EXECUTE [sp_SQLskills_helpindex] 'FactInternetSales';
GO

-- Show the statistics behind the ShipDateKey index
-- (the OrderDateKey variant is kept here for convenience).
--DBCC SHOW_STATISTICS(factinternetsales, IX_FactInternetSales_OrderDateKey)
DBCC SHOW_STATISTICS (factinternetsales, IX_FactIneternetSales_ShipDateKey);
GO

-- nulls?
-- 146,887 rows have a NULL ShipDateKey out of 30,923,776 rows in total:
-- first the fraction of rows that are NULL, then its inverse (rows per NULL).
SELECT 146887.00 / 30923776;
SELECT 30923776.00 / 146887;
GO

-- The 100 rows with the highest OrderDateKey values.
SELECT TOP (100) [f].*
FROM [dbo].[FactInternetSales] AS [f]
ORDER BY [f].[OrderDateKey] DESC;
GO

-- Range of OrderDateKey values in the table (lowest, then highest).
SELECT MIN([f].[OrderDateKey]) FROM [dbo].[FactInternetSales] AS [f];
SELECT MAX([f].[OrderDateKey]) FROM [dbo].[FactInternetSales] AS [f];

------------------------------------------------------------------------------
-- Look at what happens during optimization with uneven distribution
------------------------------------------------------------------------------

-- Report I/O and timing for every statement that follows in this session.
SET STATISTICS IO ON;
SET STATISTICS TIME ON;
GO

-- Variant 1: oldest unshipped item, forcing a table scan with INDEX(0).
SELECT MIN([sales].[OrderDateKey])
FROM [dbo].[FactInternetSales] AS [sales] WITH (INDEX(0))
WHERE [sales].[ShipDateKey] IS NULL;
GO

-- Variant 2: oldest unshipped item, no hints - the optimizer picks the plan.
SELECT MIN([sales].[OrderDateKey])
FROM [dbo].[FactInternetSales] AS [sales]
WHERE [sales].[ShipDateKey] IS NULL;
GO

-- Variant 3: oldest unshipped item, forcing the ShipDateKey index
-- (and therefore lookups to fetch OrderDateKey).
SELECT MIN([sales].[OrderDateKey])
FROM [dbo].[FactInternetSales] AS [sales]
    WITH (INDEX([IX_FactIneternetSales_ShipDateKey]))
WHERE [sales].[ShipDateKey] IS NULL;
GO



------------------------------------------------------------------------------
-- Where did this estimate come from?
------------------------------------------------------------------------------
-- select COUNT(*) FROM FactInternetSales
-- Total rows = 30,923,776

 --SELECT COUNT(*) FROM FactInternetSales
 --WHERE ShipDateKey IS NULL
--Total rows where shipped date is null = 146,887

-- SELECT 30923776/146887 = 210 --(they think they'll find one within ~210 rows)
-- However, they're horribly wrong as the unshipped items are all at the end of the table. Instead of finding the row
-- within only ~210 rows (about one in 210 is NULL) they don't encounter a NULL until the end.


------------------------------------------------------------------------------
-- Cold cache numbers and performance
------------------------------------------------------------------------------

-- Let's look at the cold cache numbers.
-- Each test is preceded by CHECKPOINT + DBCC DROPCLEANBUFFERS.
-- DROPCLEANBUFFERS only removes *clean* pages from the buffer pool, so
-- CHECKPOINT runs first to write this database's dirty pages to disk;
-- otherwise those pages stay cached and the cache is not really cold.
CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Table scan (forced with INDEX(0))
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] WITH (INDEX(0))
    WHERE [fis].[ShipDateKey] IS NULL;
-- Elapsed wall-clock time for the query above, in milliseconds
SELECT [Total Time: Table Scan] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- No hints: the optimizer chooses the plan
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis]
    WHERE [fis].[ShipDateKey] IS NULL;
-- Elapsed wall-clock time for the query above, in milliseconds
SELECT [Total Time: Lookup for NULL] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Force the shipped date index
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] 
		WITH (INDEX([IX_FactIneternetSales_ShipDateKey]))
    WHERE [fis].[ShipDateKey] IS NULL;
-- Elapsed wall-clock time for the query above, in milliseconds
SELECT [Total Time: All NULLs plus temp table] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO


------------------------------------------------------------------------------
-- What if we had a better index that correlated the columns
------------------------------------------------------------------------------

-- Finally, what if we had a better index that understands the 
-- correlation between the columns?

-- Option A - what the Missing Index DMVs recommend (visible through showplan):
-- ShipDateKey is the only key column; OrderDateKey is just an included column.
CREATE NONCLUSTERED INDEX [ShipDateOrderDateInd]
    ON [dbo].[FactInternetSales] ([ShipDateKey] ASC)
    INCLUDE ([OrderDateKey]);
GO

-- Option B - my recommendation: put the order date in the key as well, so
-- that the order date is ordered; the min is then the first record on the
-- first page.
CREATE NONCLUSTERED INDEX [ShipDateOrderDateInd_SeekableForMin]
    ON [dbo].[FactInternetSales] ([ShipDateKey] ASC, [OrderDateKey] ASC);
GO

------------------------------------------
-- Execute everything in this section
-- to compare the costs
------------------------------------------

SET STATISTICS TIME ON;
GO

-- Every test below starts from a cold cache: CHECKPOINT + DBCC DROPCLEANBUFFERS.
-- DROPCLEANBUFFERS only removes *clean* pages. The two indexes created just
-- before this section may still have dirty pages in the buffer pool, so
-- CHECKPOINT runs first to flush them; otherwise the tests that use the new
-- indexes could start with a partly warm cache and look faster than they are.
CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Test 1: forced table scan (INDEX(0))
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] WITH (INDEX(0))
    WHERE [fis].[ShipDateKey] IS NULL;
SELECT [Total Time: Table Scan] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Test 2: forced use of the existing OrderDateKey index
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] WITH (INDEX ([IX_FactInternetSales_OrderDateKey]))
    WHERE [fis].[ShipDateKey] IS NULL;
SELECT [Total Time: OrderDateKey] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Test 3: forced use of the existing ShipDateKey index
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] WITH (INDEX([IX_FactIneternetSales_ShipDateKey]))
    WHERE [fis].[ShipDateKey] IS NULL;
SELECT [Total Time: ShipDateKey] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Test 4: forced use of the new index keyed on ShipDateKey with
-- OrderDateKey as an included column
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] WITH (INDEX ([ShipDateOrderDateInd]))
    WHERE [fis].[ShipDateKey] IS NULL;
SELECT [Total Time: ShipDate + INCLUDE OrderDate Key] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO

-- Test 5: no active hint - the hint for the composite-key index is commented
-- out, so the optimizer chooses the index itself.
-- NOTE(review): presumably intentional, to show the optimizer picks the
-- composite-key index unaided; uncomment the hint to force it.
DECLARE @StartTime  datetime2;
SELECT @StartTime = SYSDATETIME();
    SELECT MIN([fis].[OrderDateKey])
    FROM [dbo].[FactInternetSales] AS [fis] --WITH (INDEX ([ShipDateOrderDateInd_SeekableForMin])) 
    WHERE [fis].[ShipDateKey] IS NULL;
SELECT [Total Time: Comp Key ShipDate + OrderDate Key] = DATEDIFF(MS, @StartTime, SYSDATETIME());
GO

-- Cleanup: remove the two demo indexes created earlier in this script.
USE [AdventureWorksDW2008_ModifiedSalesKey];
GO

-- Uses the current DROP INDEX ... ON ... form; the older
-- DROP INDEX table.index form is deprecated.
DROP INDEX [ShipDateOrderDateInd] ON [dbo].[FactInternetSales];
DROP INDEX [ShipDateOrderDateInd_SeekableForMin] ON [dbo].[FactInternetSales];
GO

------------------------------------------
-- But, how long does it take to test this?
-- Mostly in the index creation times...

-- What about autopilot?

-- Check out this great article on Simple Talk
-- "Hypothetical Indexes on SQL Server"
-- https://www.simple-talk.com/sql/database-administration/hypothetical-indexes-on-sql-server/
------------------------------------------

-- Same two index designs as the real indexes built earlier in this script,
-- but created WITH STATISTICS_ONLY.
-- NOTE(review): STATISTICS_ONLY is an undocumented option; going by the
-- article referenced above it produces a hypothetical index rather than a
-- full index build - confirm on your build.

-- TestIndex1: key on ShipDateKey, OrderDateKey as an included column.
CREATE NONCLUSTERED INDEX [TestIndex1]
    ON [dbo].[FactInternetSales] ([ShipDateKey])
    INCLUDE ([OrderDateKey])
    WITH STATISTICS_ONLY = -1;
GO

-- TestIndex2: composite key on ShipDateKey, then OrderDateKey.
CREATE NONCLUSTERED INDEX [TestIndex2]
    ON [dbo].[FactInternetSales] ([ShipDateKey], [OrderDateKey])
    WITH STATISTICS_ONLY = -1;
GO

-- List the indexes on the table.
EXEC [sp_SQLskills_helpindex] 'dbo.factinternetsales';
GO

-- Database id and object id of the fact table (for reference only; the
-- batch below looks them up itself).
SELECT db_id(), object_id('factinternetsales');
GO

-- Issue one DBCC AUTOPILOT call per hypothetical index.
-- The database id, object id and index ids are looked up instead of being
-- hard-coded, because they differ between servers, restores and tables with
-- a different set of existing indexes.
-- NOTE(review): DBCC AUTOPILOT is undocumented. The argument order used here
-- (0, database id, object id, index id) mirrors the calls this batch
-- replaces. The calls are built as a string and run with EXEC; this assumes
-- the command's effect carries over to the rest of the session - confirm.
DECLARE @DbId          int,
        @ObjId         int,
        @TestIndex1Id  int,
        @TestIndex2Id  int,
        @AutopilotCmd  nvarchar(400);

SELECT @DbId  = DB_ID(),
       @ObjId = OBJECT_ID(N'dbo.FactInternetSales');

SELECT @TestIndex1Id = [i].[index_id]
FROM [sys].[indexes] AS [i]
WHERE [i].[object_id] = @ObjId
  AND [i].[name] = N'TestIndex1'
  AND [i].[is_hypothetical] = 1;

SELECT @TestIndex2Id = [i].[index_id]
FROM [sys].[indexes] AS [i]
WHERE [i].[object_id] = @ObjId
  AND [i].[name] = N'TestIndex2'
  AND [i].[is_hypothetical] = 1;

IF @TestIndex1Id IS NULL OR @TestIndex2Id IS NULL
BEGIN
    -- Fail loudly instead of silently registering the wrong index ids.
    RAISERROR(N'Hypothetical indexes TestIndex1 and/or TestIndex2 were not found on dbo.FactInternetSales.', 16, 1);
END
ELSE
BEGIN
    SET @AutopilotCmd =
          N'DBCC AUTOPILOT(0, ' + CONVERT(nvarchar(11), @DbId) + N', '
        + CONVERT(nvarchar(11), @ObjId) + N', '
        + CONVERT(nvarchar(11), @TestIndex1Id) + N'); '
        + N'DBCC AUTOPILOT(0, ' + CONVERT(nvarchar(11), @DbId) + N', '
        + CONVERT(nvarchar(11), @ObjId) + N', '
        + CONVERT(nvarchar(11), @TestIndex2Id) + N');';

    PRINT @AutopilotCmd;   -- show exactly what is being executed
    EXEC (@AutopilotCmd);
END;
GO

-- Turn autopilot mode on for the test query.
-- NOTE(review): SET AUTOPILOT is undocumented; presumably the query below is
-- only compiled (not executed) so its plan can consider the hypothetical
-- indexes - confirm on your build.
SET AUTOPILOT ON;
GO

-- The same "oldest unshipped item" query used throughout this script, with
-- no hints.
SELECT MIN([sales].[OrderDateKey])
FROM [dbo].[FactInternetSales] AS [sales]
WHERE [sales].[ShipDateKey] IS NULL;
GO

-- Back to normal execution.
SET AUTOPILOT OFF;
GO

-- Remove the two test indexes created WITH STATISTICS_ONLY.
-- Uses the current DROP INDEX ... ON ... form; the older
-- DROP INDEX schema.table.index form is deprecated.
DROP INDEX [TestIndex1] ON [dbo].[FactInternetSales];
DROP INDEX [TestIndex2] ON [dbo].[FactInternetSales];
GO